Differential expression analysis of LCM RNA Data

Managing Packages Using Renv

To run this code in my project using the renv environment, run the following lines of code

install.packages("renv") #install the package on the new computer (may not be necessary if renv bootstraps itself as expected)
renv::restore() #reinstall all the package versions in the renv lockfile

Install packages

packages <- c("genefilter", "DESeq2", "apeglm", "ashr", "ggplot2", "vsn", 
              "hexbin", "pheatmap", "RColorBrewer", "EnhancedVolcano", "tidyverse")

lapply(packages, library, character.only = TRUE)
## Warning: package 'DESeq2' was built under R version 4.3.3
## Warning: package 'matrixStats' was built under R version 4.3.3
## Warning: package 'hexbin' was built under R version 4.3.3
## Warning: package 'ggrepel' was built under R version 4.3.3
## [[1]]
## [1] "genefilter" "stats"      "graphics"   "grDevices"  "datasets"  
## [6] "utils"      "methods"    "base"      
## 
## [[2]]
##  [1] "DESeq2"               "SummarizedExperiment" "Biobase"             
##  [4] "MatrixGenerics"       "matrixStats"          "GenomicRanges"       
##  [7] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [10] "BiocGenerics"         "stats4"               "genefilter"          
## [13] "stats"                "graphics"             "grDevices"           
## [16] "datasets"             "utils"                "methods"             
## [19] "base"                
## 
## [[3]]
##  [1] "apeglm"               "DESeq2"               "SummarizedExperiment"
##  [4] "Biobase"              "MatrixGenerics"       "matrixStats"         
##  [7] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [10] "S4Vectors"            "BiocGenerics"         "stats4"              
## [13] "genefilter"           "stats"                "graphics"            
## [16] "grDevices"            "datasets"             "utils"               
## [19] "methods"              "base"                
## 
## [[4]]
##  [1] "ashr"                 "apeglm"               "DESeq2"              
##  [4] "SummarizedExperiment" "Biobase"              "MatrixGenerics"      
##  [7] "matrixStats"          "GenomicRanges"        "GenomeInfoDb"        
## [10] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [13] "stats4"               "genefilter"           "stats"               
## [16] "graphics"             "grDevices"            "datasets"            
## [19] "utils"                "methods"              "base"                
## 
## [[5]]
##  [1] "ggplot2"              "ashr"                 "apeglm"              
##  [4] "DESeq2"               "SummarizedExperiment" "Biobase"             
##  [7] "MatrixGenerics"       "matrixStats"          "GenomicRanges"       
## [10] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [13] "BiocGenerics"         "stats4"               "genefilter"          
## [16] "stats"                "graphics"             "grDevices"           
## [19] "datasets"             "utils"                "methods"             
## [22] "base"                
## 
## [[6]]
##  [1] "vsn"                  "ggplot2"              "ashr"                
##  [4] "apeglm"               "DESeq2"               "SummarizedExperiment"
##  [7] "Biobase"              "MatrixGenerics"       "matrixStats"         
## [10] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [13] "S4Vectors"            "BiocGenerics"         "stats4"              
## [16] "genefilter"           "stats"                "graphics"            
## [19] "grDevices"            "datasets"             "utils"               
## [22] "methods"              "base"                
## 
## [[7]]
##  [1] "hexbin"               "vsn"                  "ggplot2"             
##  [4] "ashr"                 "apeglm"               "DESeq2"              
##  [7] "SummarizedExperiment" "Biobase"              "MatrixGenerics"      
## [10] "matrixStats"          "GenomicRanges"        "GenomeInfoDb"        
## [13] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [16] "stats4"               "genefilter"           "stats"               
## [19] "graphics"             "grDevices"            "datasets"            
## [22] "utils"                "methods"              "base"                
## 
## [[8]]
##  [1] "pheatmap"             "hexbin"               "vsn"                 
##  [4] "ggplot2"              "ashr"                 "apeglm"              
##  [7] "DESeq2"               "SummarizedExperiment" "Biobase"             
## [10] "MatrixGenerics"       "matrixStats"          "GenomicRanges"       
## [13] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [16] "BiocGenerics"         "stats4"               "genefilter"          
## [19] "stats"                "graphics"             "grDevices"           
## [22] "datasets"             "utils"                "methods"             
## [25] "base"                
## 
## [[9]]
##  [1] "RColorBrewer"         "pheatmap"             "hexbin"              
##  [4] "vsn"                  "ggplot2"              "ashr"                
##  [7] "apeglm"               "DESeq2"               "SummarizedExperiment"
## [10] "Biobase"              "MatrixGenerics"       "matrixStats"         
## [13] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [16] "S4Vectors"            "BiocGenerics"         "stats4"              
## [19] "genefilter"           "stats"                "graphics"            
## [22] "grDevices"            "datasets"             "utils"               
## [25] "methods"              "base"                
## 
## [[10]]
##  [1] "EnhancedVolcano"      "ggrepel"              "RColorBrewer"        
##  [4] "pheatmap"             "hexbin"               "vsn"                 
##  [7] "ggplot2"              "ashr"                 "apeglm"              
## [10] "DESeq2"               "SummarizedExperiment" "Biobase"             
## [13] "MatrixGenerics"       "matrixStats"          "GenomicRanges"       
## [16] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [19] "BiocGenerics"         "stats4"               "genefilter"          
## [22] "stats"                "graphics"             "grDevices"           
## [25] "datasets"             "utils"                "methods"             
## [28] "base"                
## 
## [[11]]
##  [1] "lubridate"            "forcats"              "stringr"             
##  [4] "dplyr"                "purrr"                "readr"               
##  [7] "tidyr"                "tibble"               "tidyverse"           
## [10] "EnhancedVolcano"      "ggrepel"              "RColorBrewer"        
## [13] "pheatmap"             "hexbin"               "vsn"                 
## [16] "ggplot2"              "ashr"                 "apeglm"              
## [19] "DESeq2"               "SummarizedExperiment" "Biobase"             
## [22] "MatrixGenerics"       "matrixStats"          "GenomicRanges"       
## [25] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [28] "BiocGenerics"         "stats4"               "genefilter"          
## [31] "stats"                "graphics"             "grDevices"           
## [34] "datasets"             "utils"                "methods"             
## [37] "base"
sessionInfo() #provides list of loaded packages and version of R.
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.0
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] lubridate_1.9.3             forcats_1.0.0              
##  [3] stringr_1.5.1               dplyr_1.1.4                
##  [5] purrr_1.0.2                 readr_2.1.5                
##  [7] tidyr_1.3.1                 tibble_3.2.1               
##  [9] tidyverse_2.0.0             EnhancedVolcano_1.20.0     
## [11] ggrepel_0.9.6               RColorBrewer_1.1-3         
## [13] pheatmap_1.0.12             hexbin_1.28.4              
## [15] vsn_3.70.0                  ggplot2_3.5.1              
## [17] ashr_2.2-63                 apeglm_1.24.0              
## [19] DESeq2_1.42.1               SummarizedExperiment_1.30.2
## [21] Biobase_2.60.0              MatrixGenerics_1.12.3      
## [23] matrixStats_1.4.1           GenomicRanges_1.52.1       
## [25] GenomeInfoDb_1.36.4         IRanges_2.34.1             
## [27] S4Vectors_0.38.2            BiocGenerics_0.46.0        
## [29] genefilter_1.84.0          
## 
## loaded via a namespace (and not attached):
##  [1] DBI_1.2.3               bitops_1.0-8            rlang_1.1.4            
##  [4] magrittr_2.0.3          compiler_4.3.2          RSQLite_2.3.7          
##  [7] png_0.1-8               vctrs_0.6.5             pkgconfig_2.0.3        
## [10] crayon_1.5.3            fastmap_1.2.0           XVector_0.40.0         
## [13] utf8_1.2.4              rmarkdown_2.28          tzdb_0.4.0             
## [16] preprocessCore_1.64.0   bit_4.5.0               xfun_0.47              
## [19] zlibbioc_1.46.0         cachem_1.1.0            jsonlite_1.8.9         
## [22] blob_1.2.4              DelayedArray_0.26.7     BiocParallel_1.34.2    
## [25] irlba_2.3.5.1           parallel_4.3.2          R6_2.5.1               
## [28] stringi_1.8.4           bslib_0.8.0             limma_3.58.1           
## [31] SQUAREM_2021.1          jquerylib_0.1.4         numDeriv_2016.8-1.1    
## [34] Rcpp_1.0.13             knitr_1.48              timechange_0.3.0       
## [37] Matrix_1.6-5            splines_4.3.2           tidyselect_1.2.1       
## [40] rstudioapi_0.16.0       abind_1.4-8             yaml_2.3.10            
## [43] codetools_0.2-20        affy_1.80.0             lattice_0.22-6         
## [46] plyr_1.8.9              withr_3.0.1             KEGGREST_1.40.1        
## [49] coda_0.19-4.1           evaluate_1.0.0          survival_3.7-0         
## [52] Biostrings_2.68.1       pillar_1.9.0            affyio_1.72.0          
## [55] BiocManager_1.30.25     renv_1.0.10             generics_0.1.3         
## [58] invgamma_1.1            RCurl_1.98-1.16         truncnorm_1.0-9        
## [61] emdbook_1.3.13          hms_1.1.3               munsell_0.5.1          
## [64] scales_1.3.0            xtable_1.8-4            glue_1.8.0             
## [67] tools_4.3.2             annotate_1.78.0         locfit_1.5-9.10        
## [70] mvtnorm_1.3-1           XML_3.99-0.17           grid_4.3.2             
## [73] bbmle_1.0.25.1          bdsmatrix_1.3-7         AnnotationDbi_1.62.2   
## [76] colorspace_2.1-1        GenomeInfoDbData_1.2.10 cli_3.6.3              
## [79] fansi_1.0.6             mixsqp_0.3-54           S4Arrays_1.0.6         
## [82] gtable_0.3.5            sass_0.4.9              digest_0.6.37          
## [85] memoise_2.0.1           htmltools_0.5.8.1       lifecycle_1.0.4        
## [88] httr_1.4.7              statmod_1.5.0           bit64_4.5.2            
## [91] MASS_7.3-60.0.1
save_ggplot <- function(plot, filename, width = 10, height = 7, units = "in", dpi = 300) {
  # Display plot
  print(plot)
  
  # Save plot
  ggsave(filename = paste0(filename, ".png"), plot = plot, width = width, height = height, units = units, dpi = dpi)
}

Read and clean count matrix and metadata

Read in raw count data

counts_raw <- read.csv("../output_RNA/stringtie/LCM_RNA_gene_count_matrix.csv", row.names = 1) #load in data

gene_id,LCM_15,LCM_16,LCM_20,LCM_21,LCM_26,LCM_27,LCM_4,LCM_5,LCM_8,LCM_9

Read in metadata

meta <- read.csv("../data_RNA/LCM_RNA_metadata.csv") %>%
            dplyr::arrange(Sample) %>%
            mutate(across(c(Tissue, Fragment, Section_Date, LCM_Date), factor)) # Set variables as factors 

meta$Tissue <- factor(meta$Tissue, levels = c("OralEpi","Aboral")) #we want OralEpi to be the baseline

Data sanity checks!

all(meta$Sample %in% colnames(counts_raw)) #are all of the sample names in the metadata column names in the gene count matrix? Should be TRUE
## [1] TRUE
all(meta$Sample == colnames(counts_raw)) #are they the same in the same order? Should be TRUE
## [1] TRUE

pOverA filtering to reduce dataset

ffun<-filterfun(pOverA(0.25,5))  #Keep genes expressed in at least 25% of samples
counts_filt_poa <- genefilter((counts_raw), ffun) #apply filter

filtered_counts <- counts_raw[counts_filt_poa,] #keep only rows that passed filter

cat("Number of genes after filtering:", sum(counts_filt_poa))
## Number of genes after filtering: 15530

There are now 15530 genes in the filtered dataset.

Data sanity checks:

all(meta$Sample %in% colnames(filtered_counts)) #are all of the sample names in the metadata column names in the gene count matrix?
## [1] TRUE
all(meta$Sample == colnames(filtered_counts))  #are they the same in the same order? Should be TRUE
## [1] TRUE

DESeq2

Create DESeq object and run DESeq2

dds <- DESeqDataSetFromMatrix(countData = filtered_counts,
                              colData = meta,
                              design= ~ Fragment + Tissue)

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Extract results for Aboral vs. OralEpi contrast

res <- results(dds, contrast = c("Tissue","Aboral","OralEpi"))
resLFC <- lfcShrink(dds, coef="Tissue_Aboral_vs_OralEpi", res=res, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## Warning in nbinomGLM(x = x, Y = YNZ, size = size, weights = weightsNZ, offset =
## offsetNZ, : the moving direction increases the objective function value

Extract results for adjusted p-value < 0.05

res <- resLFC

resOrdered <- res[order(res$pvalue),]# save differentially expressed genes

DE_05 <- as.data.frame(resOrdered) %>% filter(padj < 0.05)
DE_05_Up <- DE_05 %>% filter(log2FoldChange > 0) #Higher in Aboral, Lower in OralEpi
DE_05_Down <- DE_05 %>% filter(log2FoldChange < 0) #Lower in Aboral, Higher in OralEpi

nrow(DE_05)
## [1] 4177
nrow(DE_05_Up) #Higher in Aboral, Lower in OralEpi
## [1] 649
nrow(DE_05_Down) #Lower in Aboral, Higher in OralEpi
## [1] 3528
write.csv(as.data.frame(resOrdered), 
          file="../output_RNA/differential_expression/DESeq_results.csv")

write.csv(DE_05, 
          file="../output_RNA/differential_expression/DEG_05.csv")

Visualizing Differential Expression

EnhancedVolcano(resLFC, 
    lab = rownames(resLFC),
    x = 'log2FoldChange',
    y = 'pvalue')

Plots

plotMA(results(dds, contrast = c("Tissue","Aboral","OralEpi")), ylim=c(-20,20))

plotMA(resLFC, ylim=c(-20,20))

Log2 Fold Change Comparison

# because we are interested in the comparison and not the intercept, we set 'coef=2'
resNorm <- lfcShrink(dds, coef=6, type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef=6, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-5,5)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")

plotCounts(dds, gene=which.max(res$log2FoldChange), intgroup="Tissue")

plotCounts(dds, gene=which.min(res$log2FoldChange), intgroup="Tissue")

Transforming count data for visualization

vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
ntd <- normTransform(dds) # this gives log2(n + 1)

meanSdPlot(assay(vsd), main = "vsd")
## Warning in geom_hex(bins = bins, ...): Ignoring unknown parameters: `main`

meanSdPlot(assay(rld))

meanSdPlot(assay(ntd))

Will move forward with vst transformation for visualizations

Heatmap of count matrix

df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])

#view all genes
pheatmap(assay(vsd), cluster_rows=TRUE, show_rownames=FALSE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)

#view highest count genes
select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:20]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)

#view most significantly differentially expressed genes

select <- order(res$padj)[1:20]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df)

Heatmap of the sample-to-sample distances

sampleDists <- dist(t(assay(vsd)))

sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Tissue, vsd$Fragment, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

Principal component plot of the samples

pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Fragment"), returnData=TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
PCA <- ggplot(pcaData, aes(PC1, PC2, color=Tissue, shape=Fragment)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

save_ggplot(PCA, "../output_RNA/differential_expression/PCA")

Clearly, the majority of the variance in the data is explained by tissue type!

Annotation data

Download annotation files from genome website


# wget files
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt.gz

wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.EggNog_results.txt.gz

wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.KEGG_results.txt.gz

# move to references direcotry
mv *gz ../references

# unzip files
gunzip ../references/*gz
EggNog <- read.delim("../references/Pocillopora_acuta_HIv2.genes.EggNog_results.txt") %>% dplyr::rename("query" = X.query)

CDSearch <- read.delim("../references/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt", quote = "") %>% dplyr::rename("query" = X.Query)

KEGG <- read.delim("../references/Pocillopora_acuta_HIv2.genes.KEGG_results.txt", header = FALSE) %>% dplyr::rename("query" = V1, "KeggTerm" = V2)
DE_05$query <- rownames(DE_05)
DE_05_annot <- DE_05 %>% left_join(CDSearch) %>% select(query,everything())
## Joining with `by = join_by(query)`
DE_05_eggnog <- DE_05 %>% left_join(EggNog) %>% select(query,everything())
## Joining with `by = join_by(query)`
annot_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(CDSearch)
## Joining with `by = join_by(query)`
eggnog_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(EggNog)
## Joining with `by = join_by(query)`
write.csv(as.data.frame(DE_05_eggnog), 
          file="../output_RNA/differential_expression/DE_05_eggnog_annotation.csv")
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])
gene_labels <- eggnog_all %>% select(query,PFAMs) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) %>% #replace NAs with "" for labelling purposes
  separate(PFAMs, into = c("PFAMs", "rest of name"), sep = ",(?=.*?,)", extra = "merge")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 13646 rows [1, 2, 3, 4,
## 5, 7, 8, 10, 11, 12, 13, 14, 15, 18, 19, 21, 22, 23, 24, 25, ...].
#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]

top50_DE <- pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/top50_DE")

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

up_Aboral <- pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
up_Aboral
save_ggplot(up_Aboral, "../output_RNA/differential_expression/up_Aboral")

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

up_OralEpi <- pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
up_OralEpi
save_ggplot(up_OralEpi, "../output_RNA/differential_expression/up_OralEpi")

Genes of Interest

MarkerGenes <- read.csv("../references/Pacuta_MarkerGenes_Levy2021.csv") %>% dplyr::rename("query" = 1, "List" = 2, "definition" = 3)

MarkerGenes$def_short <- ifelse(nchar(MarkerGenes$definition) > 20, 
                            paste0(substr(MarkerGenes$definition, 1, 17), "..."), 
                            MarkerGenes$definition)
Biomin <- read.csv("../references/Pacuta_Biomin.csv") %>% dplyr::rename("query" = Pocillopora_acuta_best_hit) %>% select(-c(accessionnumber.geneID, Ref))

Biomin <- Biomin %>%
  group_by(query,List) %>%
  summarize(definition = paste(unique(definition), collapse = ","))
## `summarise()` has grouped output by 'query'. You can override using the
## `.groups` argument.
Biomin$def_short <- ifelse(nchar(Biomin$definition) > 40, 
                            paste0(substr(Biomin$definition, 1, 37), "..."), 
                            Biomin$definition)
DE_05$query <- rownames(DE_05)
DE_05_biomin <- DE_05 %>% left_join(Biomin) %>% select(query,everything()) %>% drop_na()
## Joining with `by = join_by(query)`
DE_05_marker <- DE_05 %>% left_join(MarkerGenes) %>% select(query,everything())  %>% drop_na()
## Joining with `by = join_by(query)`
write.csv(as.data.frame(DE_05_marker), 
          file="../output_RNA/differential_expression/DE_05_markergene_annotation.csv")

biomin_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(Biomin) 
## Joining with `by = join_by(query)`
biomin_all_res <- as.data.frame(resLFC) %>% mutate(query = rownames(resLFC)) %>% select(query,everything()) %>% left_join(Biomin) 
## Joining with `by = join_by(query)`
markers_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(MarkerGenes) 
## Joining with `by = join_by(query)`
markers_all_res <- as.data.frame(resLFC) %>% mutate(query = rownames(resLFC)) %>% select(query,everything()) %>% left_join(MarkerGenes) 
## Joining with `by = join_by(query)`
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])

#view biomin genes that are differntially expressed

DE_biomin <- pheatmap(assay(vsd)[DE_05_biomin$query,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
         labels_row = DE_05_biomin$def_short, fontsize_row = 5)
DE_biomin
save_ggplot(DE_biomin, "../output_RNA/differential_expression/DE_biomin")

#view marker genes that are differntially expressed

DE_marker <- pheatmap(assay(vsd)[DE_05_marker$query,], cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,cutree_rows = 5,annotation_col=df,
         labels_row = DE_05_marker$List, fontsize_row = 4)
DE_marker
save_ggplot(DE_marker, "../output_RNA/differential_expression/DE_marker")

DE_05_marker_grouped <- DE_05_marker %>% arrange(List) %>% mutate(List = as.factor(List))
DE_05_marker_grouped_plot <- pheatmap(assay(vsd)[DE_05_marker_grouped$query,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=df,
         labels_row = DE_05_marker_grouped$List, fontsize_row = 5)

DE_05_marker_grouped_plot
save_ggplot(DE_05_marker_grouped_plot, "../output_RNA/differential_expression/DE_05_marker_grouped")

Visualizing Differential Expression

Biomin_volcano <- EnhancedVolcano(biomin_all_res, 
    lab = biomin_all_res$def_short,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE)

save_ggplot(Biomin_volcano, "../output_RNA/differential_expression/Biomin_volcano")

Marker_volcano <- EnhancedVolcano(markers_all_res, 
    lab = markers_all_res$List,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE)

save_ggplot(Marker_volcano, "../output_RNA/differential_expression/Marker_volcano")

Marker_volcano_names <- EnhancedVolcano(markers_all_res, 
    lab = markers_all_res$def_short,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 20)

save_ggplot(Marker_volcano_names, "../output_RNA/differential_expression/Marker_volcano_names")

we probably want to correct for batch effects (LCM date) + genotype (Fragment)

and to decide which tranformation to use

Updating Renv environment:

After you’ve confirmed your code works as expected, use renv::snapshot() to record the packages and their sources in the lockfile.”